Intro
This report summarises the R-code used to derive the socio-demographic index (SDI) covariate for the GPW13 baseline and projection stream of work. It also provides the input data and generated estimates of SDI together with plots showing the variation of the estimates by country, region and over time. The SDI composite variable generated here is a version of the Socio-demographic Index (SDI) that was originally developed for the 2015 Global Burden of Disease study. It is a development status indicator comprising of components that are strongly correlated with health outcomes and is calculated as the geometric mean of normalised 0 to 1 indices of total fertility rate, mean years of schooling for those aged 15 and older, and lagged income per capita. Although there are SDI versions generated for the GBD2019 study, the input data used there are not publically available e.g. fertility. Thus, we construct the metric using publicly available fertility data from UNDESA World Population Prospects 2019 (WPP2019), as well as GDP per capita and education data from the UNDP Human Development Reports
Input data and SDI calculation
The code below reads in the income, education and fertility data mentioned above. First we define a set of core functions that will be used in the analysis.
#################################################################################################################
# load packages
pacman::p_load(dplyr, tidyr, data.table, pspline, ggplot2, plotly, INLA, countrycode)
# Useful functions
# Not in
'%!in%' <- function(x,y)!('%in%'(x,y))
# logit and inverse logit functions to bound estimates 0 - 1
logit <- function(x){
x[x <= 1e-5] = 1e-5; x[x >= 1-1e-4] = 1-1e-4
qlogis(log(x), log.p = TRUE)
}
inv.logit <- function(x){
exp(plogis(x, log.p = TRUE))
}
#################################################################################################################Next we read in the data, smooth it and generate the SDI variable:
#################################################################################################################
# Location data by location name, iso3 and UN country code
oceania <- c("Australia and New Zealand","Melanesia", "Micronesia", "Polynesia")
loc_label<- fread("data/name_code_iso3.csv") %>%
mutate(region = countrycode(iso3, "iso3c", "region"),
region = ifelse(region %in% oceania, "Oceania", region))
#################################################################################################################
# Read in the data to be used (education, TFR and GDP)
# Raw data
educ.r <- fread("data/educ.csv", header=T)
tfr.r <- fread("data/fert.csv", header=T)
gdp.r <- fread("data/gdppc.csv", header=T)
#################################################################################################################
# Function to smooth the historical data for random fluctuations
smoother <- function(df, var, lagy){
dfx <- df %>%
mutate(location_name = NULL, year = year + lagy) %>%
filter(iso3 %in% unique(loc_label$iso3) & year > 1989) %>%
setnames(., var, "y")
dfy <- data.table(iso3=character(), year=integer(), y=integer())
for (is in unique(dfx$iso3)){
dfz <- dfx %>% filter(iso3 == is) %>% arrange(year)
if (nrow(dfz) < 2){
dfy <- rbind(dfy, dfz)
} else {
xi = dfz$year; yi = dfz$y; xp = min(xi):max(xi)
yp <- predict(smooth.Pspline(xi, yi, spar = 1), xp)[,1]
dfy <- rbind(dfy, data.table(iso3 = is, year = xp, y = yp))
}
}
dfy %>% setnames(., "y", var) %>% arrange(iso3, year)
}
#################################################################################################################
#Smoothed data
educ <- educ.r %>% smoother(.,"educ",0)
tfr <- tfr.r %>% smoother(.,"tfr",0)
gdp <- gdp.r %>% smoother(.,"gdp",5) %>% mutate(ldi=log(gdp), gdp=NULL)
#################################################################################################################
# Bring it all together, transform variables to 0-1 scale, calculate SDI
sdi.vars <- educ %>%
left_join(tfr, by = c("iso3", "year")) %>%
left_join(gdp, by = c("iso3", "year")) %>%
left_join(loc_label, by = "iso3") %>%
select(iso3, year, ldi, tfr, educ, region) %>%
filter(!(is.na(ldi)|is.na(tfr)|is.na(educ))) %>%
mutate(educ.t = (educ - min(educ))/(max(educ) - min(educ)),
ldi.t = (ldi - min(ldi))/(max(ldi) - min(ldi)),
tfr.t = 1 - (tfr - min(tfr))/(max(tfr) - min(tfr))) %>%
rowwise() %>%
mutate(sdi.pub = prod(educ.t, ldi.t, tfr.t)^(1/3)) %>% ungroup() %>%
filter(year > 1999)
sdi.data <- list(loc_label=loc_label, educ.r=educ.r, tfr.r=tfr.r, gdp.r=gdp.r,
educ=educ, tfr=tfr, gdp=gdp, sdi.vars=sdi.vars)The raw SDI input data, smoothed intermediate files and calculated SDI can be downloaded as an R-database using the sdi.data.RDa link below:
A number of countries are missing at least one input and thus an SDI cannot be estimated for those countries directly. These countries are listed below:
#################################################################################################################
# List all the countries without an SDI
missing.sdi <- loc_label %>% filter(iso3 %!in% unique(sdi.vars$iso3))
missing.sdi %>% pull(location_name) %>% unique()## [1] "Andorra" "Antigua and Barbuda"
## [3] "The Bahamas" "Belize"
## [5] "Brunei" "Bhutan"
## [7] "Cook Islands" "Dominica"
## [9] "Eritrea" "Fiji"
## [11] "Federated States of Micronesia" "Grenada"
## [13] "Guyana" "Kiribati"
## [15] "Saint Kitts and Nevis" "Monaco"
## [17] "Maldives" "Marshall Islands"
## [19] "Niue" "Nauru"
## [21] "Palau" "Papua New Guinea"
## [23] "North Korea" "Solomon Islands"
## [25] "San Marino" "Somalia"
## [27] "South Sudan" "Suriname"
## [29] "Slovakia" "Timor-Leste"
## [31] "Tonga" "Tuvalu"
## [33] "Saint Vincent and the Grenadines" "Vanuatu"
## [35] "Samoa"
For these missing countries we can impute the SDI according to a regression model. As mentioned previously, a version of the SDI has been generated for the GBD2019 using other data sources that are not publically available. The GBD2019 SDI (sdi.gbd) and the SDI from the public data (sdi.pub) can be compared:
#################################################################################################################
# sdi estimate according to GBD2019
sdi.gbd.r <- fread("data/sdigbd2019.csv", header = T)
sdi.gbd <- sdi.gbd.r %>%
gather(year, sdi.gbd, -location_name) %>%
mutate(year = as.numeric(year)) %>% filter(year > 1999) %>%
right_join(loc_label, by = "location_name")
#################################################################################################################
# Data frame with both GBD and public SDI and where possible, demographic ind
sdi.mod.df <- sdi.vars %>%
select(iso3, year, sdi.pub) %>%
right_join(sdi.gbd, by = c("iso3","year")) %>%
filter(!(iso3 %!in% missing.sdi$iso3 & is.na(sdi.pub))) %>%
select(location_name, iso3, region, year, sdi.pub, sdi.gbd) %>%
arrange(iso3, year) To impute the SDI in the 35 locations listed above, a simple linear regression model is fit to sdi.pub using the sdi.gbd as well as region.
#################################################################################################################
# Regression model to predict public SDI from GBD SDI
sdi.mod <- lm(sdi.pub ~ sdi.gbd + region - 1, data = sdi.mod.df)
summary(sdi.mod)##
## Call:
## lm(formula = sdi.pub ~ sdi.gbd + region - 1, data = sdi.mod.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.202446 -0.016619 -0.000869 0.018275 0.112647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## sdi.gbd 0.988359 0.005766 171.423 < 2e-16 ***
## regionCaribbean 0.056952 0.004422 12.880 < 2e-16 ***
## regionCentral America 0.058368 0.004108 14.208 < 2e-16 ***
## regionCentral Asia 0.080449 0.004772 16.858 < 2e-16 ***
## regionEastern Africa 0.020649 0.002886 7.154 1.07e-12 ***
## regionEastern Asia 0.037543 0.005446 6.894 6.68e-12 ***
## regionEastern Europe 0.055434 0.004828 11.481 < 2e-16 ***
## regionMiddle Africa 0.003468 0.003268 1.061 0.2888
## regionNorthern Africa 0.001277 0.004313 0.296 0.7672
## regionNorthern America 0.047637 0.007006 6.800 1.28e-11 ***
## regionNorthern Europe 0.022690 0.005267 4.308 1.70e-05 ***
## regionOceania 0.041141 0.006893 5.968 2.70e-09 ***
## regionSouth-Eastern Asia 0.032022 0.004021 7.964 2.39e-15 ***
## regionSouth America 0.053459 0.004144 12.900 < 2e-16 ***
## regionSouthern Africa 0.034847 0.004484 7.771 1.08e-14 ***
## regionSouthern Asia 0.060783 0.003694 16.456 < 2e-16 ***
## regionSouthern Europe 0.030797 0.004702 6.550 6.82e-11 ***
## regionWestern Africa 0.001810 0.002571 0.704 0.4816
## regionWestern Asia 0.006911 0.004252 1.625 0.1042
## regionWestern Europe 0.013522 0.005602 2.414 0.0158 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03049 on 2812 degrees of freedom
## (700 observations deleted due to missingness)
## Multiple R-squared: 0.9977, Adjusted R-squared: 0.9977
## F-statistic: 6.121e+04 on 20 and 2812 DF, p-value: < 2.2e-16
Based on the model diagnostics and assessing the model fit by comparing the fitted to the observed values, we see that in general, the model fits the data well.
The model is used to impute SDI where missing:
#################################################################################################################
# Imputing SDI using the linear regression model on sdi.gbd
sdi.out <- sdi.mod.df %>%
mutate(preds = predict(sdi.mod, sdi.mod.df),
sdi.pub = ifelse(is.na(sdi.pub), preds, sdi.pub)) %>%
select(location_name, iso3, year, sdi.pub, region) Forecasting the SDI covariate
The goal being to potentially forecast indicator values for the GPW13 and where missing, to impute them using the SDI as a covariate, an initial step is to complete the SDI time-series to cover the entire range relevant to the GPW13. This is accomplished by forecasting the SDI for the years applicable to the GPW13 (also back-casting where historical data are non-existent). In this case we forecast the SDI time-series using a random walk (RW2) model which has the density \[\pi(y) \propto exp\left(-\frac{1}{2}\sum_{i=3}^n (y_i - 2y_{i-1} + y_{i-2})^2\right)\] where the term \(y_{i+1} −2y_i +y_{i−1}\) can be interpreted as an estimate of the second order derivative of a continuous time function \(y(t)\) at \(t = i\) using values of \(y(t)\) at \(t = i − 1\), \(i\), and \(i+1\).
#################################################################################################################
# Function to forecast a series using time-series random walk
forecast.series <- function(dmod){
result <- inla(sdi.pub ~ f(x, model = "rw2"), data=dmod,
control.predictor= list(compute=TRUE))
tibble(pred = result$summary.fitted.values$mean,
lwr = result$summary.fitted.values$"0.025quant",
uppr = result$summary.fitted.values$"0.975quant")
}
#################################################################################################################
# Loop performing forecast by country
isg <- unique(sdi.out$iso3)
out <- list(dim = length(isg))
dum <- tibble(x = 1:26, sdi.pub = NA)
for (ig in 1:length(isg)){
covar.db <- sdi.out %>% filter(iso3 == isg[ig]) %>% arrange(year)
cov.lab <- covar.db %>% select(location_name, iso3, region) %>% distinct()
dmod <- covar.db %>% mutate(x = year - 1999) %>% select(x, sdi.pub)
dmod <- dmod %>% rbind(dum %>% filter(x %!in% dmod$x)) %>% arrange(x)
preds <- forecast.series(dmod)
out[[ig]] <- data.table(cov.lab, year=2000:2025, sdi.pub=dmod$sdi.pub, preds)
}
forecasted.sdi <- rbindlist(out)In addition to SDI, we also take life-expectancy as a potential covariate. From the UNPOP division WPP2019, life-expectancy at birth has aleady been forecasted for 183 of the 194 member states. We read in the data:
#################################################################################################################
# life-expactancy indicator from WPP2019
e0.r <- fread("data/demogind.csv")
e0.in <- e0.r %>%
right_join(loc_label, by = "country_code") %>%
filter(!(Variant =="Medium variant" & year ==2020)) %>%
filter(year %in% 2000:2025) %>% distinct() %>%
select(iso3, year, e0) And also provide a list the countries missing \(e_0\):
missing.e0 <- loc_label %>% filter(iso3 %!in% unique(e0.in$iso3))
missing.e0 %>% pull(location_name) %>% unique()## [1] "Andorra" "Cook Islands" "Dominica"
## [4] "Saint Kitts and Nevis" "Monaco" "Marshall Islands"
## [7] "Niue" "Nauru" "Palau"
## [10] "San Marino" "Tuvalu"
We can impute life-expectancy using the SDI, again utilizing a basic linear model (also noting that we are effectively imposing a collinearity for the countries missing both SDI and \(e_0\)):
#################################################################################################################
# Predict e0 using the SDI and impute for the missing 11 locations
cov.in <- forecasted.sdi %>% left_join(e0.in, by=c("iso3","year")) %>% rename(sdi=pred)
m.e0 <- lm(e0 ~ sdi + region - 1, data = cov.in)
p.e0 <- predict(m.e0, cov.in)
summary(m.e0)##
## Call:
## lm(formula = e0 ~ sdi + region - 1, data = cov.in)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.6468 -2.0600 0.3273 2.2228 10.5779
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## sdi 34.6907 0.4616 75.15 <2e-16 ***
## regionCaribbean 49.3217 0.3817 129.21 <2e-16 ***
## regionCentral America 53.0632 0.3751 141.47 <2e-16 ***
## regionCentral Asia 45.4072 0.4406 103.05 <2e-16 ***
## regionEastern Africa 47.3138 0.2492 189.84 <2e-16 ***
## regionEastern Asia 50.5474 0.4542 111.28 <2e-16 ***
## regionEastern Europe 45.9588 0.4283 107.30 <2e-16 ***
## regionMiddle Africa 43.3010 0.2997 144.47 <2e-16 ***
## regionNorthern Africa 51.0839 0.3591 142.27 <2e-16 ***
## regionNorthern America 49.2882 0.6387 77.16 <2e-16 ***
## regionNorthern Europe 49.5092 0.4492 110.22 <2e-16 ***
## regionOceania 49.7307 0.3574 139.14 <2e-16 ***
## regionSouth-Eastern Asia 49.8201 0.3544 140.58 <2e-16 ***
## regionSouth America 50.6031 0.3677 137.62 <2e-16 ***
## regionSouthern Africa 35.6946 0.4137 86.27 <2e-16 ***
## regionSouthern Asia 51.7463 0.3341 154.88 <2e-16 ***
## regionSouthern Europe 51.9013 0.4073 127.44 <2e-16 ***
## regionWestern Africa 47.1091 0.2331 202.07 <2e-16 ***
## regionWestern Asia 50.7357 0.3613 140.43 <2e-16 ***
## regionWestern Europe 51.0909 0.4773 107.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.543 on 4738 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.9975, Adjusted R-squared: 0.9975
## F-statistic: 9.486e+04 on 20 and 4738 DF, p-value: < 2.2e-16
Finally, we bring all the files together to create the covariate database complete for all countries and years 2000 to 2025:
#################################################################################################################
# Covariates data-frame
covariates <- cov.in %>%
mutate(p.e0s = p.e0, e0 = ifelse(is.na(e0), p.e0s, e0)) %>%
select(location_name, iso3, year, region, sdi.pub, sdi, lwr, uppr, e0) %>% distinct() %>%
filter(year %in% 2000:2025) %>% arrange(iso3, year) %>%
mutate(SDI.class = ifelse(iso3 %in% missing.sdi$iso3, "Imputed", "Public data"),
e0.class = ifelse(iso3 %in% missing.e0$iso3, "Imputed", "Public data"))
#################################################################################################################
# List to download
covariate.data <- list(sdi.gbd.r=sdi.gbd.r, sdi.gbd=sdi.gbd, sdi.mod.df=sdi.mod.df, sdi.out=sdi.out,
forecasted.sdi=forecasted.sdi,
e0.r=e0.r, e0.in = e0.in, cov.in = cov.in, covariates= covariates)All of the covariate raw and intermediate files and the final covariates dataframe can be downloaded as an R-database using the covariate.data.RDa link below:
The plot below illustrates the estimated and forecasted SDI by country:
Similar to the plots above, the plots below show the time-series of life-expectancy at birth \(e_0\).